package is2.mtag;


import is2.data.Cluster;
import is2.data.F2SF;
import is2.data.Instances;
import is2.data.InstancesTagger;
import is2.data.Long2Int;
import is2.data.Long2IntInterface;
import is2.data.ParametersFloat;
import is2.data.PipeGen;
import is2.data.SentenceData09;
import is2.io.CONLLReader09;
import is2.tools.IPipe;
import is2.util.OptionsSuper;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map.Entry;


final public class ExtractorM extends PipeGen implements IPipe {

	/** Id returned when the END marker is registered in the CHAR alphabet (see initFeatures()); passed to InstancesTagger.fillChars in createInstances(). */
	public  static int _CEND;


	// Marker strings that initFeatures() registers as artificial entries of the WORD, POS and CHAR alphabets.
	private static final String STWRD = "STWRD",STPOS = "STPOS",END = "END",STR = "STR";

	/** Label strings indexed by their FFEATS id; built at the end of the second pass of createInstances(). */
	public String[] types;

	/** Cluster data; created in createInstances(), queried via getLP() in addCF() and serialized by write(). */
	Cluster cl;

	/** Feature registry on which every register/getValue/calc call of this extractor operates. */
	final public MFO mf =new MFO();
	/** Maps long feature codes to int indices (li.l2i in fillFeatureVectorsOne()); assigned by the two-argument constructor. */
	public Long2IntInterface li;



	// Reusable encoders: their bit widths (a0..a7) are set in initValues(), their values (v0..v6) in addCF().
	final MFO.Data4 d1 = new MFO.Data4(),d2 = new MFO.Data4(),d3 = new MFO.Data4(),dw = new MFO.Data4();
	final MFO.Data4 dwp = new MFO.Data4(),dp = new MFO.Data4();


	/** Options handed to the constructor; createInstances() reads count and clusterFile from it. */
	private OptionsSuper options;
	/** Id of the END marker in the WORD alphabet (set in initFeatures()). */
	private int _ewrd;
	// _strp/_endp: ids of STR/END in the POS alphabet (set in initFeatures()).
	// _mid is not assigned in the visible code (its only assignment is commented out).
	static private int _mid, _strp,_endp;

	/**
	 * Creates an extractor that can also score features, because it carries the
	 * mapping from long feature codes to weight indices.
	 *
	 * @param options the options; stored for later use by createInstances()
	 * @param long2Int the mapping stored in {@link #li}
	 * @throws IOException declared for callers; nothing in this constructor throws it
	 */
	public ExtractorM (Options options, Long2Int long2Int) throws IOException {
		this(options);
		this.li = long2Int;
	}

	/**
	 * Creates an extractor without a feature-code mapping; {@link #li} stays
	 * {@code null} until it is assigned from outside.
	 *
	 * @param options the options; stored for later use by createInstances()
	 */
	public ExtractorM (OptionsSuper options) {
		this.options = options;
	}


	/**
	 * Maps the WORD id of a lower-cased form to the FFEATS id of its feature string.
	 * createInstances() adds a form only if it was seen with exactly one feature string
	 * and more than 10 times; the map is serialized by writeMap() and restored by readMap().
	 */
	public HashMap<Integer,Integer> form2morph = new HashMap<Integer, Integer> ();

	
	/**
	 * Reads a CoNLL-09 file three times and builds the tagger instances.
	 * <ol>
	 * <li>Pass one registers forms, lower-cased forms, their characters, predicted lemmas,
	 *     both POS columns and the feature strings with {@link #mf}, and collects for each
	 *     lower-cased form its frequency and the set of feature strings seen with it.
	 *     Forms seen with exactly one feature string and more than 10 times go into
	 *     {@link #form2morph}.</li>
	 * <li>After the alphabets are frozen (initFeatures, calculateBits, initValues), pass two
	 *     loads the sentences into the instance container, and {@link #types} and the
	 *     cluster are created.</li>
	 * <li>Pass three fills the per-token character tables.</li>
	 * </ol>
	 * Passes two and three stop once their counter exceeds {@code options.count}.
	 *
	 * @param file path of the training file
	 * @return the populated instances (an {@link InstancesTagger})
	 */
	public Instances createInstances(String file) {

		CONLLReader09 reader = new CONLLReader09(CONLLReader09.NO_NORMALIZE);

		reader.startReading(file);
		mf.register(POS,"<root-POS>");

		mf.register(FFEATS, CONLLReader09.NO_TYPE);
		mf.register(FFEATS, "");

		InstancesTagger is = new InstancesTagger();

		System.out.println("Registering feature parts ");

		// lower-cased form -> feature strings observed with it / number of occurrences
		HashMap<String,HashSet<String>> featsOfForm = new HashMap<String, HashSet<String>> ();
		HashMap<String,Integer> formCount = new HashMap<String, Integer> ();

		// ---- pass 1: register everything; the order of register calls per alphabet is kept ----
		int sentenceCount = 0;
		for (SentenceData09 sentence = reader.getNext(); sentence != null; sentence = reader.getNext()) {
			sentenceCount++;

			String[] column = sentence.forms;
			for (String form : column) {
				mf.register(WORD, form);
			}
			for (String form : column) {
				registerChars(CHAR, form);
			}

			for (int k = 0; k < column.length; k++) {
				String lower = column[k].toLowerCase();
				mf.register(WORD, lower);

				Integer seen = formCount.get(lower);
				if (seen == null) {
					formCount.put(lower, 1);
				} else {
					formCount.put(lower, seen + 1);
				}

				HashSet<String> feats = featsOfForm.get(lower);
				if (feats == null) {
					feats = new HashSet<String>();
					featsOfForm.put(lower, feats);
				}
				String observed = sentence.ofeats[k];
				feats.add(observed == null ? "_" : observed);
			}
			for (String form : column) {
				registerChars(CHAR, form.toLowerCase());
			}

			column = sentence.plemmas;
			for (String lemma : column) {
				mf.register(WORD, lemma);
			}
			for (String lemma : column) {
				registerChars(CHAR, lemma);
			}

			column = sentence.ppos;
			for (String pos : column) {
				mf.register(POS, pos);
			}

			column = sentence.gpos;
			for (String pos : column) {
				mf.register(POS, pos);
			}

			column = sentence.ofeats;
			for (String feat : column) {
				if (feat != null) {
					mf.register(FEAT, feat);
				}
			}
			for (String feat : column) {
				if (feat != null) {
					mf.register(FFEATS, feat);
				}
			}
		}

		// frequent forms with a single observed feature string get a direct mapping
		for (Entry<String,HashSet<String>> candidate : featsOfForm.entrySet()) {
			HashSet<String> feats = candidate.getValue();
			if (feats.size() != 1 || !(formCount.get(candidate.getKey()) > 10)) {
				continue;
			}
			form2morph.put(mf.getValue(ExtractorM.WORD, candidate.getKey()), mf.getValue(FFEATS, (String) feats.toArray()[0]));
		}

		initFeatures();

		mf.calculateBits();
		initValues();

		System.out.println(""+mf.toString());

		// ---- pass 2: load the sentences into the instance container ----
		reader.startReading(file);

		int loaded = 0;
		long started = System.currentTimeMillis();

		System.out.print("Creating Features: ");
		is.init(sentenceCount, mf);
		int shown = 0;

		for (;;) {
			if (loaded % 100 == 0) {
				shown = outValue(loaded, shown);
			}
			if (reader.getNext(is) == null || loaded > options.count) {
				break;
			}
			loaded++;
		}
		long finished = System.currentTimeMillis();
		System.gc();
		Runtime runtime = Runtime.getRuntime();
		long usedBytes = runtime.totalMemory() - runtime.freeMemory();
		System.out.print("  time "+(finished-started)+" mem "+(usedBytes/1024)+" kb");

		// invert the FFEATS alphabet: id -> label string
		types = new String[mf.getFeatureCounter().get(FFEATS)];
		for (Entry<String,Integer> label : mf.getFeatureSet().get(FFEATS).entrySet()) {
			types[label.getValue()] = label.getKey();
		}

		cl = options.clusterFile == null ? new Cluster() : new Cluster(options.clusterFile, mf, 6);

		System.out.println("Num Features: " + types.length);

		// ---- pass 3: fill the per-token character tables ----
		reader.startReading(file);

		int filled = 0;
		SentenceData09 current;
		while ((current = reader.getNext()) != null) {
			is.fillChars(current, filled, _CEND);
			if (filled > options.count) {
				break;
			}
			filled++;
		}

		return is;
	}

	/**
	 * Registers every character of a word as a one-character string with {@link #mf}.
	 *
	 * @param type the alphabet to register in
	 * @param word the word whose characters are registered, in order of occurrence
	 */
	private void registerChars(String type, String word) {
		for (char ch : word.toCharArray()) {
			mf.register(type, String.valueOf(ch));
		}
	}



	/**
	 * Caches the bit widths of the alphabets in the static {@code s_*} fields and
	 * configures the slot widths ({@code a0..a7}) of the reusable encoders.
	 * Every encoder uses slot 0 for the template id and slot 1 for the feature label.
	 */
	public void initValues() {
		s_feat = mf.getFeatureBits(FFEATS);
		s_word = mf.getFeatureBits(WORD);
		s_type = mf.getFeatureBits(TYPE);
		s_char = mf.getFeatureBits(CHAR);
		s_pos = mf.getFeatureBits(POS);

		// single word
		d1.a0 = s_type;
		d1.a1 = s_feat;
		d1.a2 = s_word;

		// up to five feature labels
		d2.a0 = s_type;
		d2.a1 = d2.a2 = d2.a3 = d2.a4 = d2.a5 = d2.a6 = s_feat;

		// up to six characters
		d3.a0 = s_type;
		d3.a1 = s_feat;
		d3.a2 = d3.a3 = d3.a4 = d3.a5 = d3.a6 = d3.a7 = s_char;

		// two POS tags followed by a feature label
		dp.a0 = s_type;
		dp.a1 = s_feat;
		dp.a2 = dp.a3 = s_pos;
		dp.a4 = s_feat;

		// up to six words
		dw.a0 = s_type;
		dw.a1 = s_feat;
		dw.a2 = dw.a3 = dw.a4 = dw.a5 = dw.a6 = dw.a7 = s_word;

		// word, feature label, word
		dwp.a0 = s_type;
		dwp.a1 = s_feat;
		dwp.a2 = s_word;
		dwp.a3 = s_feat;
		dwp.a4 = s_word;
	}

	// Bit widths of the alphabets as reported by mf.getFeatureBits(); s_feat, s_word, s_type,
	// s_char and s_pos are set in initValues(). s_dir and s_dist are not assigned in the visible code.
	public static short s_feat,s_word,s_type,s_dir,s_dist,s_char,s_pos;



	/**
	 * Registers the template names {@code F0..F61} and the artificial boundary
	 * markers, and remembers the ids of the markers that feature extraction needs.
	 * The registration order within each alphabet is significant because it
	 * determines the ids.
	 */
	public void initFeatures() {

		int template = 0;
		while (template < 62) {
			mf.register(TYPE, "F" + template);
			template++;
		}

		// boundary markers of the POS alphabet
		_strp = mf.register(POS, STR);
		_endp = mf.register(POS, END);

		// boundary markers of the WORD alphabet; only the end marker id is kept
		mf.register(WORD, STR);
		_ewrd = mf.register(WORD, END);

		// end marker of the CHAR alphabet
		_CEND = mf.register(CHAR, END);

		// optional features
		mf.register(WORD, STWRD);
		mf.register(POS, STPOS);
	}


	/**
	 * Computes the feature codes of token {@code i} of sentence {@code ic} and stores them in
	 * {@code vs}, starting at index 0 and terminated by the sentinel {@code Integer.MIN_VALUE}.
	 * <p>
	 * {@code f} is the running template id written to slot {@code v0} of the encoders. Blocks that
	 * are only executed under a condition advance {@code f} by a fixed amount after the block, so the
	 * templates that follow keep the same id whether or not the block ran. The encoders
	 * ({@code d1, d2, d3, dw, dwp, dp}) are shared mutable fields and keep their slot values between
	 * statements; several templates rely on slots set by an earlier statement.
	 * <p>
	 * No bounds check is done on {@code vs}; the caller must supply an array that is large enough for
	 * all emitted codes plus the sentinel.
	 *
	 * @param is instances providing the character tables ({@code chars}) and lower-cased form ids ({@code formlc})
	 * @param ic index of the sentence within {@code is}
	 * @param fs surface string of the token; only inspected for upper-case letters and digits
	 * @param i position of the token in the sentence
	 * @param pfeat per-token feature label ids; only the neighbours i-3, i-2, i-1 and i+1 are read
	 *              (presumably ids of the FFEATS alphabet, since they are encoded with s_feat bits -- TODO confirm)
	 * @param ppos per-token POS ids; read at i and i-1
	 * @param forms word form ids of the sentence
	 * @param lemmas lemma ids of the sentence
	 * @param vs output array receiving the feature codes
	 */
	final public void addCF(InstancesTagger is, int ic, String fs,int i, short pfeat[],short ppos[], int[] forms, int[] lemmas, long[] vs) {

		// c0..c5 are entries 0-5 and e0..e4 entries 6-10 of the token's character table
		// (presumably leading and trailing characters as filled by InstancesTagger.fillChars -- TODO confirm)
		int c0= is.chars[ic][i][0], c1=is.chars[ic][i][1], c2=is.chars[ic][i][2], c3=is.chars[ic][i][3], c4=is.chars[ic][i][4],c5=is.chars[ic][i][5];
		int e0 =is.chars[ic][i][6], e1 =is.chars[ic][i][7],e2 =is.chars[ic][i][8],e3 =is.chars[ic][i][9],e4 =is.chars[ic][i][10];

		int f=1,n=0;
		// upper: 0 = no upper-case letter, 1 = only the first character, 2 = only later characters, 3 = first and a later one
		// number: 1 = no digit, 2 = first character is a digit, 3 = a digit occurs but not at the first position
		short upper =0, number = 1;
		for(int k1=0;k1<fs.length();k1++){
			char c = fs.charAt(k1);
			if (Character.isUpperCase(c)) {
				if (k1==0) upper=1;
				else {
					// first char + another
					if (upper==1) upper=3;
					// another uppercase in the word
					else if (upper==0) upper=2;
				}
			}

			if (Character.isDigit(c) && k1==0) number =2 ;
			else if (Character.isDigit(c) && number==1) number = 3;
		}

		int form = forms[i];

		int len = forms.length;		
		long l;
		// NOTE(review): calc3(d1) is evaluated twice here; the value stored in l is not read before l is reassigned below.
		d1.v0 = f++; d1.v2=form; l=mf.calc3(d1); vs[n++]=mf.calc3(d1);

		// lower-cased form
		d1.v0 = f++; d1.v2=is.formlc[ic][i]; vs[n++]=mf.calc3(d1);

		// c0..c4 encoded with increasing length (1 to 5 characters)
		d3.v2=c0; d3.v3=c1; d3.v4=c2; d3.v5=c3; d3.v6=c4;
		d3.v0=f++; vs[n++]=mf.calc3(d3);
		d3.v0=f++; vs[n++]=mf.calc4(d3); 
		d3.v0=f++; vs[n++]=mf.calc5(d3); 
		d3.v0=f++; vs[n++]=mf.calc6(d3);
		d3.v0=f++; vs[n++]=mf.calc7(d3);

		// c2..c5, and c2..c5 followed by the cluster value of the form; two template ids are reserved either way
		if (form!=-1) {
			d3.v2=c2; d3.v3=c3; d3.v4=c4; d3.v5=c5; d3.v6=cl.getLP(form);
			d3.v0=f; vs[n++]=mf.calc6(d3); d3.v0=f+1; vs[n++]=mf.calc7(d3);
		} 
		f+=2;

		// the cluster value successively replaces slots v5, v4 and v3; the lower slots still hold the
		// values set in the block above (form>0 implies that block ran)
		if (form>0) {
			d3.v0=f; d3.v5=cl.getLP(form); vs[n++]=mf.calc6(d3);
			d3.v0=f+1; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3);
			d3.v0=f+2; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3);
		}
		f+=3;

		// e0..e4 with increasing length; from two characters on, each code is additionally combined with
		// the capitalization class through d3.calcs (semantics defined by MFO.Data4)
		d3.v2=e0; d3.v3=e1; d3.v4=e2; d3.v5=e3; d3.v6=e4;
		d3.v0 =f++; vs[n++]=mf.calc3(d3);	
		d3.v0 =f++; vs[n++]=l=mf.calc4(d3); vs[n++]=d3.calcs(3, upper, l); 
		d3.v0 =f++; vs[n++]=l=mf.calc5(d3); vs[n++]=d3.calcs(3, upper, l); 
		d3.v0 =f++; vs[n++]=l=mf.calc6(d3); vs[n++]=d3.calcs(3, upper, l); 		
		d3.v0 =f++; vs[n++]=l=mf.calc7(d3); vs[n++]=d3.calcs(3, upper, l); 

		// same cluster substitution as above, now on top of e0..e4
		if (form>0) {
			d3.v0=f; d3.v5=cl.getLP(form); vs[n++]=mf.calc6(d3);
			d3.v0=f+1; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3);
			d3.v0=f+2; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3);
		}
		f+=3;


		// next form (or the word end marker for the last token) together with the current form
		dw.v0=f++; dw.v2=i+1<len?forms[i+1]:_ewrd;dw.v3= forms[i];vs[n++]=mf.calc4(dw);
		
		// templates that look at the following tokens; 16 template ids are reserved for this block
		if (len>i+1) {

			dw.v0=f; dw.v2= forms[i+1]; vs[n++]=mf.calc3(dw);
			d3.v0=f+1; d3.v2 =is.chars[ic][i+1][0];vs[n++]=mf.calc3(d3);
			d3.v0=f+2; d3.v2 =is.chars[ic][i+1][6];vs[n++]=mf.calc3(d3);

			d3.v2=e0; d3.v3=e1;  

			d3.v0 =f+3; d3.v4 =is.chars[ic][i+1][0];vs[n++]=mf.calc5(d3);
			d3.v0 =f+4; d3.v4 =is.chars[ic][i+1][6];vs[n++]=mf.calc5(d3);

			
			
			// entry 11 of the character table is treated as the length of the word (see the comment on this line)
			if (is.chars[ic][i+1][11]>1 ) { //  instance.forms[i+1].length()

				d3.v0=f+5; d3.v2=is.chars[ic][i+1][0]; d3.v3=is.chars[ic][i+1][1]; vs[n++]=mf.calc4(d3);
				d3.v0=f+6; d3.v2=is.chars[ic][i+1][6];	d3.v3=is.chars[ic][i+1][7]; vs[n++]=mf.calc4(d3);

				d3.v2=e0; d3.v3=e1;  

				d3.v0=f+7; d3.v4 = is.chars[ic][i+1][0]; d3.v5 =is.chars[ic][i+1][1]; vs[n++]=mf.calc6(d3);
				d3.v0=f+8; d3.v4 = is.chars[ic][i+1][6]; d3.v5=is.chars[ic][i+1][7]; vs[n++]=mf.calc6(d3);				

				if (forms[i+1]>0) {
					d3.v0=f+9; d3.v2=is.chars[ic][i+1][0]; d3.v3=is.chars[ic][i+1][1]; d3.v4 =cl.getLP(forms[i+1]); vs[n++]=mf.calc5(d3);
					d3.v0=f+10; d3.v2=is.chars[ic][i+1][6];	d3.v3=is.chars[ic][i+1][7]; d3.v4 =cl.getLP(forms[i+1]); vs[n++]=mf.calc5(d3);
				} 
			} 
			
			if (forms[i+1]>0) {
				dw.v0=f+11; dw.v2= cl.getLP(forms[i+1]); dw.v3= forms[i];vs[n++]=mf.calc4(dw);
			} 
			
			if (len>i+2) {
				dw.v0=f+12; dw.v2= forms[i+2]; dw.v3 = forms[i+1];vs[n++]=mf.calc4(dw);vs[n++]=mf.calc3(dw);
//				d2.v0=f+13; d2.v2=pfeat[i+1]; d2.v3=  pfeat[i+2];	vs[n++]=mf.calc4(d2);
			//	dp.v0= f+14; dp.v2=ppos[i+1]; dp.v3=ppos[i+2]; vs[n++]=mf.calc4(dp);

			} 

			if (len>i+3) {
				dw.v0=f+14; dw.v2= forms[i+3]; dw.v3 = forms[i+2]; vs[n++]=mf.calc4(dw); vs[n++]=mf.calc3(dw);
	
			} 
		} 
		f+=16;

		// length
		d2.v0=f++; d2.v2=is.chars[ic][i][11];vs[n++]=mf.calc3(d2);


		// contains a number
		d2.v0=f++; d2.v2=number;	vs[n++]=mf.calc3(d2);
		// lemma of the current token
		d1.v0=f++;  d1.v2=lemmas[i];	vs[n++]=mf.calc3(d1);

		// surrounding lemmas and surrounding feature labels; two template ids are reserved either way
		if (i!=0 &&len>i+1) {
			dw.v0=f; dw.v2=lemmas[i-1];dw.v3=lemmas[i+1];vs[n++]=mf.calc4(dw);                      
			d2.v0=f+1; d2.v2=pfeat[i-1]; d2.v3=pfeat[i+1];vs[n++]=mf.calc4(d2);                      
		} 
		f+=2;

		// feature label of the previous token (POS start marker id for the first token), then the POS of the current token
		d2.v0= f++; d2.v2=i>=1? pfeat[i-1]:_strp; vs[n++]=mf.calc3(d2);
		dp.v0= f++; dp.v2=ppos[i]; vs[n++]=mf.calc3(dp);

		// templates over the preceding tokens; f is not rebalanced after this block because nothing follows it
		if (i>0) {
			// inside this block i>=1 always holds, so the _strp alternative is never taken
			dw.v0 = f++; dw.v2 =i>=1?  forms[i-1]:_strp; vs[n++]=mf.calc3(dw);
			dw.v0 = f++; dw.v2 = i>=1? lemmas[i-1]:_strp; vs[n++]=mf.calc3(dw);			
					
			if (len>i+1) {
//				d2.v0=f;	d2.v2= pfeat[i-1];d2.v3= pfeat[i+1]; vs[n++]=mf.calc4(d2);
		//		dp.v0= f+1; dp.v2=ppos[i-1]; dp.v3=ppos[i+1]; vs[n++]=mf.calc4(dp);

			}
			// skips one template id; the templates in the block above are commented out
			f++;
			dp.v0= f++; dp.v2=ppos[i]; dp.v3=ppos[i-1]; vs[n++]=mf.calc4(dp);
					
			if (i>1) {
				// i<2 cannot hold here, so pfeat[i-2] is always used
				d2.v0=f++;	d2.v2=i<2?_strp:  pfeat[i-2]; vs[n++]=mf.calc3(d2);
				d2.v0=f++;	d2.v2= pfeat[i-1]; d2.v3= pfeat[i-2]; vs[n++]=mf.calc4(d2);
				
				dw.v0=f++;	dw.v2= forms[i-2]; vs[n++]=mf.calc3(dw);		
				dwp.v0=f++;   dwp.v2 = forms[i-1]; dwp.v3 =  pfeat[i-2];vs[n++]=mf.calc4(dwp);
				dwp.v0=f++;   dwp.v2 = forms[i-2]; dwp.v3 =  pfeat[i-1];vs[n++]=mf.calc4(dwp);

				if (i>2) {
					d2.v0=f++;	d2.v2=pfeat[i-3]; vs[n++]=mf.calc3(d2);
					d2.v0=f++;	d2.v2=pfeat[i-2]; d2.v3= pfeat[i-3]; vs[n++]=mf.calc4(d2);
					dw.v0=f++;   dw.v2 = forms[i-3]; dw.v3 = forms[i-2]; vs[n++]=mf.calc4(dw);
		//			dp.v0= f++; dp.v2=ppos[i-3]; dp.v3=ppos[i-2]; vs[n++]=mf.calc4(dp);
				}
			}
		}
		// sentinel marking the end of the emitted codes (checked in fillFeatureVectorsOne)
		vs[n] = Integer.MIN_VALUE;
	}


	

	


	/**
	 * Extracts the features of one token and returns the id of the best-scoring label.
	 * <p>
	 * The feature codes are computed once by {@link #addCF}. For every candidate label
	 * {@code t} the label id is shifted left by {@code s_type} bits and added to each
	 * code, the result is mapped to a weight index through {@code li.l2i}, and the
	 * weights are accumulated in an {@link F2SF} scorer. Negative codes are skipped and
	 * the sentinel {@code Integer.MIN_VALUE} ends the list.
	 * <p>
	 * The search starts from negative infinity, so a label is returned even when all
	 * scores are strongly negative. The previous start value of {@code -1} made the
	 * method return {@code -1}, an invalid index into {@link #types}, whenever no label
	 * scored above -1.
	 *
	 * @param params the weight vector
	 * @param w1 position of the token in the sentence
	 * @param form surface string of the token
	 * @param is the instances; must be an {@link InstancesTagger}
	 * @param n index of the sentence within {@code is}
	 * @param features per-token feature label ids, passed to addCF as context
	 * @param vs scratch array receiving the feature codes; must be large enough for addCF
	 * @return the index into {@link #types} of the best-scoring label, or {@code -1} if
	 *         {@code types} is empty or no score compares greater than negative infinity
	 */
	public int fillFeatureVectorsOne(ParametersFloat params, int w1, String form, Instances is, int n, short[] features, long[] vs) {
		double best = Double.NEGATIVE_INFINITY;
		int bestType = -1;

		F2SF f = new F2SF(params.parameters);
		addCF((InstancesTagger)is, n, form, w1, features, is.pposs[n], is.forms[n], is.plemmas[n], vs);

		for (int t = 0; t < types.length; t++) {

			f.clear();
			// offset that places the candidate label into the label slot of every code
			int p = t << ExtractorM.s_type;
			for (int k = 0; k < vs.length; k++) {
				if (vs[k] == Integer.MIN_VALUE) break;
				if (vs[k] >= 0) f.add(li.l2i(vs[k] + p));
			}
			if (f.score > best) {
				bestType = t;
				best = f.score;
			}
		}
		return bestType;
	}

	
	
		//static ArrayList<T> todo = new ArrayList<T>();
	// Not read or written in the visible code; createInstances() uses a local variable of the same name.
	static SentenceData09 instance;
	
	
	// NOTE(review): not referenced in the visible code; the meaning of 200 cannot be determined from here.
	public static int _FC =200;
	
	
	/**
	 * Serializes {@link #form2morph}: first the number of entries, then for each
	 * entry the key followed by the value, all as 4-byte ints.
	 * An {@link IOException} is not propagated; its stack trace is printed.
	 *
	 * @param dos the stream to write to
	 */
	public void writeMap(DataOutputStream dos) {

		try {
			dos.writeInt(form2morph.size());
			for (Integer formId : form2morph.keySet()) {
				dos.writeInt(formId);
				dos.writeInt(form2morph.get(formId));
			}
		} catch (IOException ex) {
			ex.printStackTrace();
		}
	}



	/**
	 * Restores entries of {@link #form2morph} in the layout produced by
	 * {@link #writeMap}: an entry count followed by key/value int pairs.
	 * Existing entries are kept unless a read key replaces them. An
	 * {@link IOException} is not propagated; its stack trace is printed and
	 * the entries read so far remain in the map.
	 *
	 * @param dis the stream to read from
	 */
	public void readMap(DataInputStream dis) {
		try {
			for (int remaining = dis.readInt(); remaining > 0; remaining--) {
				int formId = dis.readInt();
				int morphId = dis.readInt();
				form2morph.put(formId, morphId);
			}
		} catch (IOException ex) {
			ex.printStackTrace();
		}
	}

	
	/**
	 * Writes the cluster data followed by the form-to-morphology map.
	 * If writing the cluster fails, the map is not written; the exception is
	 * not propagated and its stack trace is printed.
	 *
	 * @param dos the stream to write to
	 * @see is2.tools.IPipe#write(java.io.DataOutputStream)
	 */
	@Override
	public void write(DataOutputStream dos) {
		try {
			this.cl.write(dos);
			this.writeMap(dos);
		} catch (IOException failure) {
			failure.printStackTrace();
		}
	}


}
